# Written on 2019-06-16
# Written by lidabao
import requests
import re
import time

# HTTP headers passed to requests.get() by get(); the User-Agent value
# identifies the client as mobile Safari on an iPhone.
kv = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
        'AppleWebKit/604.1.38 (KHTML, like Gecko) '
        'Version/11.0 Mobile/15A372 Safari/604.1'
    ),
}

# Shared pool of URLs: get() adds the links it finds, go() pops from it.
s = set()

# Matches cnblogs post links of the form
# https://www.cnblogs.com/<user>/p/<8 digits>.html.
# The dots are escaped so they only match a literal '.', and the <user> part
# is restricted to a single path component so that one match cannot run
# across HTML markup into a later link on the same line.
_BLOG_URL_RE = re.compile(
    r'https://www\.cnblogs\.com/[^/\s"\'<>]+/p/[0-9]{8}\.html'
)


def get(url, timeout=10):
    """Fetch *url* and collect the cnblogs post links found in the page.

    Every link found is printed, appended (one per line) to the file
    ``blog_urls`` in the current directory, and added to the module-level
    set ``s``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        None. Network and file errors are reported on stdout and
        otherwise ignored so that the caller can continue with the next URL.
    """
    try:
        res = requests.get(url, headers=kv, timeout=timeout)
        # Let requests guess the charset from the body instead of trusting
        # the response headers.
        res.encoding = res.apparent_encoding
        urls = _BLOG_URL_RE.findall(res.text)
        print(urls)
        with open('blog_urls', 'a', encoding='utf-8') as f:
            f.writelines(link + '\n' for link in urls)
        # In-place update: no need to rebind the global name.
        s.update(urls)
    except (requests.RequestException, OSError) as exc:
        # Only expected I/O failures are handled here, so Ctrl-C and
        # programming errors are no longer silently swallowed.
        print(url, "爬取失败", exc)

def go():
    """Crawl until the module-level URL set ``s`` is empty.

    Repeatedly pops a URL from ``s``, passes it to ``get()`` (which adds
    newly found links back into ``s``), prints the remaining set and
    sleeps one second between requests.

    URLs that have already been crawled are remembered and removed from
    ``s`` after each fetch, so pages that link back to each other are not
    downloaded over and over. The loop ends normally when ``s`` runs empty
    instead of failing with ``KeyError`` from ``set.pop()``.
    """
    global s
    visited = set()
    while s:
        url = s.pop()
        visited.add(url)
        get(url)
        # get() may have re-added pages we have already crawled.
        s -= visited
        print('*'*30, s, '*'*30)
        time.sleep(1)
if __name__ == "__main__":
    # Seed the URL set with the cnblogs home page, then start crawling.
    seed_url = "https://www.cnblogs.com/"
    s.add(seed_url)
    go()

